In [12]:
import numpy as np
import nolearn
import sklearn.linear_model as lm
import scipy.stats as sps
import math
import pandas as pd
from decimal import Decimal
# from lasagne import layers, nonlinearities
# from lasagne.updates import nesterov_momentum
# from lasagne import layers
# from nolearn.lasagne import NeuralNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer scikit-learn
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.svm import SVR
from sklearn.externals import joblib  # a standalone 'joblib' package in newer scikit-learn
from sklearn.utils import resample
from sklearn.preprocessing import LabelBinarizer
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [13]:
import custom_funcs as cf
cf.init_seaborn('white', 'notebook')
from isoelectric_point import isoelectric_points
from molecular_weight import molecular_weights
In [14]:
# Read in the protease inhibitor data
data, drug_cols, feat_cols = cf.read_data('hiv-protease-data.csv', n_data_cols=8)
print(len(data))
# Read in the consensus data
consensus_map = cf.read_consensus('hiv-protease-consensus.fasta')
# Clean the data
data = cf.clean_data(data, feat_cols, consensus_map)
# Identify feature columns
data = cf.drop_ambiguous_sequences(data, feat_cols)
data.dropna(inplace=True, subset=feat_cols)
data
Out[14]:
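The cleaning step above leans on helpers in `custom_funcs`. As a rough illustration of what `drop_ambiguous_sequences` might do (the real implementation lives in custom_funcs.py, so this is a guess), here is a minimal sketch that drops rows where any sequence column records a mixture of amino acids (e.g. 'IV' at a single position):
In [ ]:
# Hypothetical sketch only -- the real logic is in custom_funcs.py.
def drop_ambiguous_sequences_sketch(df, feat_cols):
    """Keep only rows whose sequence columns each hold a single residue."""
    unambiguous = df[feat_cols].applymap(
        lambda aa: isinstance(aa, str) and len(aa) == 1
    ).all(axis=1)
    return df[unambiguous]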
In [15]:
# Audience choice: Which drug would you like?
print(drug_cols)
DRUG = 'FPV'
In [16]:
feat_cols
Out[16]:
In [17]:
# Split the data into predictor and dependent variables:
# the predictors are the sequence features, and the dependents are the drug resistance values.
def binarize_data(data, feat_cols, DRUG):
    data = resample(data)
    X, Y = cf.split_data_xy(data, feat_cols, DRUG)
    # Binarize the sequence features so that there are 99 x 20 columns in total.
    lb = LabelBinarizer()
    lb.fit(list('CHIMSVAGLPTRFYWDNEQK'))
    X_binarized = pd.DataFrame()
    for col in X.columns:
        binarized_cols = lb.transform(X[col])
        for i, c in enumerate(lb.classes_):
            X_binarized[col + '_' + c] = binarized_cols[:, i]
    return X_binarized, Y
X_binarized, Y = binarize_data(data, feat_cols, DRUG)
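To make the binarization concrete, here is a tiny standalone demo of what `LabelBinarizer` does with one sequence column: each amino-acid letter becomes a 20-element indicator row, which is how 99 positions expand to 99 × 20 binary columns.
In [ ]:
# Minimal demo of the one-hot encoding used above.
lb_demo = LabelBinarizer()
lb_demo.fit(list('CHIMSVAGLPTRFYWDNEQK'))
example_column = ['A', 'V', 'A']  # residues at one position for three sequences
encoded = lb_demo.transform(example_column)
print(encoded.shape)     # (3, 20): one indicator column per amino acid
print(lb_demo.classes_)  # the 20 residues, in alphabetical order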
In [18]:
# View distribution of drug resistance values
import matplotlib.pyplot as plt
std = (3,3)
fig = cf.plot_Y_histogram(Y, DRUG, figsize=std)
In [19]:
# Split data into training and testing set.
tts_data = X_train, X_test, Y_train, Y_test = train_test_split(X_binarized, Y)
# Train a bunch of ensemble regressors:
## Random Forest
kwargs = {'n_jobs':-1, 'n_estimators':1000}
rfr, rfr_preds, rfr_mse, rfr_r2 = cf.train_model(*tts_data, model=RandomForestRegressor, modelargs=kwargs)
## Gradient Boosting
kwargs = {'n_estimators':1000}
gbr, gbr_preds, gbr_mse, gbr_r2 = cf.train_model(*tts_data, model=GradientBoostingRegressor, modelargs=kwargs)
## AdaBoost
kwargs = {'n_estimators':1000}
abr, abr_preds, abr_mse, abr_r2 = cf.train_model(*tts_data, model=AdaBoostRegressor, modelargs=kwargs)
## ExtraTrees
etr, etr_preds, etr_mse, etr_r2 = cf.train_model(*tts_data, model=ExtraTreesRegressor)
## Bagging
bgr, bgr_preds, bgr_mse, bgr_r2 = cf.train_model(*tts_data, model=BaggingRegressor)
# Plot the results of regression
rfr_fig = cf.scatterplot_results(rfr_preds, Y_test, rfr_mse, rfr_r2, DRUG, 'Rand. Forest', figsize=std)
# plt.savefig('figures/{0} Random Forest.pdf'.format(DRUG), bbox_inches='tight')
cf.scatterplot_results(gbr_preds, Y_test, gbr_mse, gbr_r2, DRUG, 'Grad. Boost', figsize=std)
cf.scatterplot_results(abr_preds, Y_test, abr_mse, abr_r2, DRUG, 'AdaBoost', figsize=std)
cf.scatterplot_results(etr_preds, Y_test, etr_mse, etr_r2, DRUG, 'ExtraTrees', figsize=std)
cf.scatterplot_results(bgr_preds, Y_test, bgr_mse, bgr_r2, DRUG, 'Bagging', figsize=std)
Out[19]:
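`cf.train_model` returns the fitted model together with its test-set predictions, MSE, and an r² score. Its actual implementation is in custom_funcs.py; the scoring choices in this sketch are assumptions, but it presumably looks something like this:
In [ ]:
# Hypothetical sketch of cf.train_model -- see custom_funcs.py for the real version.
def train_model_sketch(X_train, X_test, Y_train, Y_test, model, modelargs=None):
    reg = model(**(modelargs or {}))
    reg.fit(X_train, Y_train)           # fit on the training split
    preds = reg.predict(X_test)         # predict on the held-out split
    mse = mean_squared_error(Y_test, preds)
    r2 = r2_score(Y_test, preds)
    return reg, preds, mse, r2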
In [20]:
# Grab the feature importances - that is, how important a particular feature is for predicting drug resistance
cf.barplot_feature_importances(rfr, DRUG, 'Rand. Forest')
cf.barplot_feature_importances(gbr, DRUG, 'Grad. Boost')
cf.barplot_feature_importances(abr, DRUG, 'AdaBoost')
cf.barplot_feature_importances(etr, DRUG, 'ExtraTrees')
# cf.barplot_feature_importances(bgr, DRUG, 'Bagging')  ## BaggingRegressor does not expose feature_importances_
Out[20]:
In [30]:
# Extract a table version of feature importance
rfr_fi = cf.extract_mutational_importance(rfr, X_test)
gbr_fi = cf.extract_mutational_importance(gbr, X_test)
abr_fi = cf.extract_mutational_importance(abr, X_test)
# Join data to compare random forest and gradient boosting.
# joined = rfr_fi.set_index(0).join(gbr_fi.set_index(0), lsuffix='r', rsuffix='g')
# sps.spearmanr(joined['1r'], joined['1g'])
rfr_fi
Out[30]:
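The binarized column names encode position and residue (each is the original feature column name plus '_' plus the amino acid), so tree feature importances map back to specific mutations. A plausible sketch of what `cf.extract_mutational_importance` does (the commented-out join above suggests the real version returns integer-labelled columns, so the column names here are assumptions):
In [ ]:
# Hypothetical sketch of cf.extract_mutational_importance.
def extract_mutational_importance_sketch(model, X_test):
    """Pair each binarized column (position_residue) with its importance,
    sorted from most to least important."""
    fi = pd.DataFrame({
        'mutation': X_test.columns,
        'importance': model.feature_importances_,
    })
    return fi.sort_values('importance', ascending=False)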
In [31]:
# Train a bunch of linear model learners for comparison.
brr, brr_preds, brr_mse, brr_r2 = cf.train_model(*tts_data, model=lm.BayesianRidge)
ard, ard_preds, ard_mse, ard_r2 = cf.train_model(*tts_data, model=lm.ARDRegression)
# Note: LogisticRegression is a classifier despite its name; here each distinct
# resistance value is treated as a discrete class, so interpret its scores with care.
logr, logr_preds, logr_mse, logr_r2 = cf.train_model(*tts_data, model=lm.LogisticRegression)
enr, enr_preds, enr_mse, enr_r2 = cf.train_model(*tts_data, model=lm.ElasticNet)
svr, svr_preds, svr_mse, svr_r2 = cf.train_model(*tts_data, model=SVR)
# Likewise, plot the results
cf.scatterplot_results(brr_preds, Y_test, brr_mse, brr_r2, DRUG, 'Bayesian Ridge', figsize=std)
cf.scatterplot_results(ard_preds, Y_test, ard_mse, ard_r2, DRUG, 'ARD Regression', figsize=std)
cf.scatterplot_results(logr_preds, Y_test, logr_mse, logr_r2, DRUG, 'Logistic Regression', figsize=std)
cf.scatterplot_results(enr_preds, Y_test, enr_mse, enr_r2, DRUG, 'ElasticNet', figsize=std)
cf.scatterplot_results(svr_preds, Y_test, svr_mse, svr_r2, DRUG, 'SVR', figsize=std)
Out[31]:
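A single train/test split gives a noisy estimate of MSE and r²; cross-validation averages the error over several splits. A quick sketch for the random forest (scorer names differ across scikit-learn versions; 'mean_squared_error' is the name in the version this notebook targets, and the scores it returns are negated):
In [ ]:
# Cross-validated error estimate for the random forest (sketch).
from sklearn.cross_validation import cross_val_score
cv_scores = cross_val_score(
    RandomForestRegressor(n_estimators=100, n_jobs=-1),
    X_binarized, Y, cv=5, scoring='mean_squared_error',
)
print(-cv_scores.mean())  # scores are negated MSEs, so flip the sign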
In [ ]:
# Let's now try a neural network.
# Neural network 1 specification: a feed-forward ANN with one hidden layer.
# Note: this cell needs the lasagne/nolearn imports that are commented out at the top.
x_train = X_train.astype(np.float32)
y_train = Y_train.astype(np.float32)
x_test = X_test.astype(np.float32)
y_test = Y_test.astype(np.float32)
net1 = NeuralNet(
    layers=[  # one hidden layer, with dropout and a tanh nonlinearity
        ('input', layers.InputLayer),
        ('hidden1', layers.DenseLayer),
        ('dropout1', layers.DropoutLayer),
        # ('hidden2', layers.DenseLayer),
        # ('dropout2', layers.DropoutLayer),
        ('nonlinear', layers.NonlinearityLayer),
        ('output', layers.DenseLayer),
    ],
    # layer parameters:
    input_shape=(None, x_train.shape[1]),
    hidden1_num_units=math.ceil(x_train.shape[1] / 2),  # number of units in the hidden layer
    hidden1_nonlinearity=nonlinearities.tanh,
    dropout1_p=0.65,
    # hidden2_num_units=math.ceil(x_train.shape[1] / 2),
    # dropout2_p=0.5,
    output_nonlinearity=None,  # output layer uses the identity function
    output_num_units=1,  # a single regression target
    # optimization method:
    update=nesterov_momentum,
    update_learning_rate=0.01,
    update_momentum=0.95,
    regression=True,  # flag to indicate we're dealing with a regression problem
    max_epochs=500,  # train for this many epochs
    verbose=1,
)
net1.fit(x_train.values, y_train.values)
In [ ]:
# Now let's see how well the network's predictions line up with the truth.
nn1_preds = net1.predict(x_test)
nn1_mse = float(mean_squared_error(nn1_preds, y_test))
# Note: this "r2" is the Pearson correlation coefficient r, not the coefficient of determination.
nn1_r2 = float(sps.pearsonr(nn1_preds, y_test.reshape(y_test.shape[0], 1))[0][0])
cf.scatterplot_results(nn1_preds, y_test, nn1_mse, nn1_r2, DRUG, 'Neural Net', figsize=std)
# plt.savefig('figures/{0} Neural Net.pdf'.format(DRUG), bbox_inches='tight')
In [15]:
# Save models
# Neural net
joblib.dump(net1, 'models/{0} nnet1.pkl'.format(DRUG))
# Random Forest
joblib.dump(rfr, 'models/{0} rfr.pkl'.format(DRUG))
# Gradient Boost
joblib.dump(gbr, 'models/{0} gbr.pkl'.format(DRUG))
# ExtraTrees
joblib.dump(etr, 'models/{0} etr.pkl'.format(DRUG))
Out[15]:
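The pickled models can be reloaded later for prediction without retraining; `joblib.load` is the counterpart to `joblib.dump`:
In [ ]:
# Reload a saved model and reuse it for prediction.
rfr_loaded = joblib.load('models/{0} rfr.pkl'.format(DRUG))
reloaded_preds = rfr_loaded.predict(X_test)
print(r2_score(Y_test, reloaded_preds))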